getwd()
## [1] "/Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks"
source("../references/biostats.R")
list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
load(file = "../results/gene-counts-filtered") #object = counts.filtered
ggplotly(
ggplot(data = data.frame(colSums(counts.filtered)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample") %>%
filter(grepl("_larvae", sample))) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample, Larvae") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))
### Load transformed counts object
load(file = "../results/gene-counts-trans") #object = counts.t
counts.t.larvae <- as.data.frame(counts.t[grepl(".larvae", rownames(counts.t)), ])
counts.t.larvae[,1:3] #take a peak at the resulting dataframe
## OLUR_00020575 OLUR_00006628 OLUR_00032161
## 39_larvae 3 65 238
## 401_larvae 1 69 263
## 402_larvae 2 58 218
## 403_larvae 0 72 420
## 404_larvae 0 52 929
## 411_larvae 1 101 495
## 413_larvae 1 47 593
## 414_larvae 0 67 24
## 421_larvae 0 78 190
## 43_larvae 3 91 170
## 431b_larvae 0 97 687
## 434_larvae 0 18 430
## 442b_larvae 0 50 355
## 443_larvae 1 21 861
## 444_larvae 0 242 1573
## 445_larvae 0 97 444
## 451_larvae 1 147 1042
## 452b_larvae 0 95 589
## 453_larvae 4 60 182
## 461b_larvae 0 86 513
## 47_larvae 0 130 253
## 471b_larvae 0 99 576
## 472b_larvae 1 211 530
## 473_larvae 1 366 878
## 474_larvae 0 182 920
## 475_larvae 0 314 1270
## 476_larvae 0 193 1086
## 477_larvae 0 106 574
## 481_larvae 1 41 185
## 482_larvae 0 57 521
## 483_larvae 0 38 146
## 484_larvae 3 39 196
## 485_larvae 1 182 393
## 487_larvae 2 94 323
## 488_larvae 0 176 1156
## 489_larvae 0 98 485
## 490_larvae 1 76 324
## 491_larvae 0 57 425
## 492_larvae 3 25 371
## 506_larvae 0 9 294
## 513_larvae 0 68 112
## 522_larvae 0 91 234
## 523_larvae 1 123 934
## 524_larvae 0 57 998
## 525_larvae 0 64 184
## 526_larvae 2 119 304
## 527_larvae 1 51 107
## 528_larvae 0 165 740
## 529_larvae 0 89 645
## 531_larvae 3 106 280
## 532_larvae 1 50 262
## 533_larvae 0 47 389
## 541_larvae 0 71 660
## 542_larvae 0 20 155
## 543_larvae 1 85 419
## 551_larvae 0 257 966
## 552b_larvae 0 80 260
## 553_larvae 1 127 308
## 554_larvae 3 280 531
## 563_larvae 2 57 975
## 564_larvae 0 124 474
keep <- colSums(counts.t.larvae) >= 10
counts.ts.larvae <- counts.t.larvae[,keep]
print(paste("# genes remaining after pre-filtering:", ncol(counts.ts.larvae)))
## [1] "# genes remaining after pre-filtering: 30443"
print(paste("# of genes dropped:", ncol(counts.t.larvae) - ncol(counts.ts.larvae), sep=" "))
## [1] "# of genes dropped: 1767"
ggplotly(
ggplot(data = data.frame(rowSums(counts.t.larvae != 0)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample") %>%
filter(grepl("_larvae", sample))) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample, larvae, pooled") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))
# note: you'll need to press return in the console for all plots
#foa.plots(counts.ts.larvae)
# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day
load(file="../raw-data/key")
counts.tsk.larvae <- merge(x=key, by.x="sample_stage", y=counts.ts.larvae, by.y="row.names") %>%
arrange(stage, population, pCO2.parent) %>% column_to_rownames(var="sample_stage")
head(counts.tsk.larvae[1:24]) #check out results of merge/arrange
## lane sample RNA.conc RNA.ng endpoint.RFU pcr.cycles prep.batch
## 401_larvae 2 401 99 350 127 16 5
## 402_larvae 2 402 6 350 88 16 4
## 403_larvae 2 403 12 350 68 14 5
## 404_larvae 2 404 5 350 31 15 4
## 421_larvae 2 421 60 288 24 14 4
## 411_larvae 2 411 77 350 112 16 5
## species stage tissue DNA.ng bioan.mean.bp
## 401_larvae Ostrea lurida larvae whole body, pooled 1.24 26
## 402_larvae Ostrea lurida larvae whole body, pooled 2.76 27
## 403_larvae Ostrea lurida larvae whole body, pooled 4.16 28
## 404_larvae Ostrea lurida larvae whole body, pooled 2.92 28
## 421_larvae Ostrea lurida larvae whole body, pooled 1.63 28
## 411_larvae Ostrea lurida larvae whole body, pooled 3.08 18
## index.no index.seq population temp.parent pCO2.parent larval.sample
## 401_larvae 7012 ATGAAC Dabob Bay 10 Ambient 14-A
## 402_larvae 7006 GTGTAG Dabob Bay 10 Ambient 31-A
## 403_larvae 7032 CGAAGG Dabob Bay 10 Ambient 75-A
## 404_larvae 7046 CTCCAT Dabob Bay 10 Ambient 80-A
## 421_larvae 7024 CCGCAA Dabob Bay 6 Ambient 59-A
## 411_larvae 7011 TTAACT Dabob Bay 10 High 23-A
## filename OLUR_00020575 OLUR_00006628 OLUR_00032161 OLUR_00011450
## 401_larvae 401 1 69 263 17
## 402_larvae 402 2 58 218 12
## 403_larvae 403 0 72 420 78
## 404_larvae 404 0 52 929 57
## 421_larvae 421 0 78 190 26
## 411_larvae 411 1 101 495 21
## OLUR_00018391
## 401_larvae 0
## 402_larvae 2
## 403_larvae 3
## 404_larvae 2
## 421_larvae 2
## 411_larvae 5
#counts.tsk.larvae %>% dplyr::select(starts_with("OLUR")) #this is code to get only the gene columns
NOTE: scale=“column” b/c range of counts is so huge, so counts have been scaled
pheatmap(data.matrix(counts.tsk.larvae %>% dplyr::select(starts_with("OLUR"))), Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA,
show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE,
scale="column", color=c("dodgerblue3", "goldenrod1"),
main = "Oly Larvae Gene Counts", annotation_row=counts.tsk.larvae[,c("population", "pCO2.parent")],
filename = "../results/heatmap-larval-counts.png")
Resulting heat map is located in results/ directory:
NOTE: It is absolutely critical that the columns of the count matrix and the rows of the column data (information about samples) are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.
all(rownames(counts.tsk.larvae) == counts.tsk.larvae %>% dplyr::select(starts_with("OLUR")) %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE'
## [1] TRUE
#counts.DESeq <- counts.tsk.larvae[-which(rownames(counts.tsk.larvae) %in% "571_larvae"), grepl("OLUR", colnames(counts.tsk.larvae))] %>% t()
#key.DESeq <- counts.tsk.larvae[-which(rownames(counts.tsk.larvae) %in% "571_larvae"),c("population", "pCO2.parent")]
dds.larvae <- DESeqDataSetFromMatrix(countData = counts.tsk.larvae[,grepl("OLUR", colnames(counts.tsk.larvae))] %>% t(),
colData = counts.tsk.larvae[,c("population", "pCO2.parent")] ,
design = ~ population + pCO2.parent)
## factor levels were dropped which had no samples
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
blind=FALSE b/c we are interested in differences explained by experimental design, and may wish to use this transformed data in downstream analyses.vsd.larvae <- varianceStabilizingTransformation(dds.larvae, blind=FALSE)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
NOTE: Hover over points to see the sample numbers
# PCA with points color coded by population
#ggplotly(
plotPCA(vsd.larvae, intgroup="population") +
ggtitle("PCA by population (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by parental pCO2 exposure
#ggplotly(
plotPCA(vsd.larvae, intgroup="pCO2.parent") +
ggtitle("PCA by parental pCO2 exposure (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by tissue and pco2 factors
#ggplotly(
plotPCA(vsd.larvae, intgroup=c("population","pCO2.parent")) +
ggtitle("PCA by population + parental pCO2 (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# extract treatment info from VSD transformation
#vsd.larvae.df <- as.data.frame(colData(vsd.larvae)[,c("population", "pCO2.parent")])
# generate heatmap from untransformed counts
#pheatmap(counts(dds.larvae), cluster_rows=FALSE, show_rownames=FALSE,
#cluster_cols=T, annotation_col=vsd.larvae.df, scale = "row", main="QuantSeq, untransformed data (but scaled by rows")
# generate heatmap from VSD counts
#pheatmap(assay(vsd.larvae), cluster_rows=FALSE, show_rownames=FALSE,
#cluster_cols=T, annotation_col=vsd.larvae.df, main = "QuantSeq, VSD-transformed")
Another use of the transformed data is sample clustering. Here, we apply the dist function to the transpose of the transformed count matrix to get sample-to-sample distances.
A heatmap of this distance matrix gives us an overview over similarities and dissimilarities between samples. We have to provide a hierarchical clustering hc to the heatmap function based on the sample distances, or else the heatmap function would calculate a clustering based on the distances between the rows/columns of the distance matrix.
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)
sampleDists <- dist(t(assay(vsd.larvae)))
sampleDistMatrix <- as.matrix(sampleDists)
# Here we show pCO2.parent + population
rownames(sampleDistMatrix) <- paste(vsd.larvae$population, vsd.larvae$pCO2.parent, sep="-") #set row names
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=rev(colors), fontsize = 6)
DESeq to assess differential expressiondds.larvae.DESeq <- DESeq(dds.larvae)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
print("Comparison: parental pCO2 - All populations")
## [1] "Comparison: parental pCO2 - All populations"
summary(res.larvae.pco2 <- results(dds.larvae.DESeq, contrast=c("pCO2.parent", "Ambient", "High"), alpha=0.05)) #not with p=0.05, but checking p=0.1 just because
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 1, 0.0033%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pCO2, larvae, comparison across all pops:", sum(res.larvae.pco2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pCO2, larvae, comparison across all pops: 0"
# nope
# Here are all the possible contrasts I can make
levels(dds.larvae.DESeq$population)
## [1] "Dabob Bay" "Fidalgo Bay" "Oyster Bay C1" "Oyster Bay C2"
print("Comparison: Dabob Bay vs. Fidalgo Bay")
## [1] "Comparison: Dabob Bay vs. Fidalgo Bay"
summary(res.larvae.DBFB <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Fidalgo Bay"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 188, 0.62%
## LFC < 0 (down) : 127, 0.42%
## outliers [1] : 1, 0.0033%
## low counts [2] : 5312, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Dabob Bay vs. Oyster Bay C1")
## [1] "Comparison: Dabob Bay vs. Oyster Bay C1"
summary(res.larvae.DBOB1 <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Oyster Bay C1"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 143, 0.47%
## LFC < 0 (down) : 125, 0.41%
## outliers [1] : 1, 0.0033%
## low counts [2] : 1771, 5.8%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Dabob Bay vs. Oyster Bay C2")
## [1] "Comparison: Dabob Bay vs. Oyster Bay C2"
summary(res.larvae.DBOB2 <- results(dds.larvae.DESeq, contrast=c("population", "Dabob Bay", "Oyster Bay C2"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 100, 0.33%
## LFC < 0 (down) : 63, 0.21%
## outliers [1] : 1, 0.0033%
## low counts [2] : 3542, 12%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Fidalgo Bay vs. Oyster Bay C1")
## [1] "Comparison: Fidalgo Bay vs. Oyster Bay C1"
summary(res.larvae.FBOB1 <- results(dds.larvae.DESeq, contrast=c("population", "Fidalgo Bay", "Oyster Bay C1"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 247, 0.81%
## LFC < 0 (down) : 414, 1.4%
## outliers [1] : 1, 0.0033%
## low counts [2] : 5312, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Fidalgo Bay vs. Oyster Bay C2")
## [1] "Comparison: Fidalgo Bay vs. Oyster Bay C2"
summary(res.larvae.FBOB2 <- results(dds.larvae.DESeq, contrast=c("population", "Fidalgo Bay", "Oyster Bay C2"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 126, 0.41%
## LFC < 0 (down) : 128, 0.42%
## outliers [1] : 1, 0.0033%
## low counts [2] : 4722, 16%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: Oyster Bay C1 vs. Oyster Bay C2")
## [1] "Comparison: Oyster Bay C1 vs. Oyster Bay C2"
summary(res.larvae.OB1OB2 <- results(dds.larvae.DESeq, contrast=c("population", "Oyster Bay C1", "Oyster Bay C2"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 1, 0.0033%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) between Dabob & Fidalgo larvae:", sum(res.larvae.DBFB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Fidalgo larvae: 315"
paste("No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C1 larvae:", sum(res.larvae.DBOB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C1 larvae: 268"
paste("No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C2 larvae:", sum(res.larvae.DBOB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Dabob & Oyster Bay C2 larvae: 163"
paste("No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C1 larvae:", sum(res.larvae.FBOB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C1 larvae: 661"
paste("No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C2 larvae:", sum(res.larvae.FBOB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Fidalgo & Oyster Bay C2 larvae: 254"
paste("No. of genes differentially expressed (padj<0.05) between Oyster Bay C1 & C2 larvae:", sum(res.larvae.OB1OB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) between Oyster Bay C1 & C2 larvae: 0"
diffex.larvae.DBFB <- subset(res.larvae.DBFB, padj < 0.05)
diffex.larvae.DBOB1 <- subset(res.larvae.DBOB1, padj < 0.05)
diffex.larvae.DBOB2 <- subset(res.larvae.DBOB2, padj < 0.05)
diffex.larvae.FBOB1 <- subset(res.larvae.FBOB1, padj < 0.05)
diffex.larvae.FBOB2 <- subset(res.larvae.FBOB2, padj < 0.05)
# note: no differences between OB1 and OB2
# save all R objects to file for integration
save(diffex.larvae.DBFB, file="../results/diffex.larvae.DBFB")
save(diffex.larvae.DBOB1, file="../results/diffex.larvae.DBOB1")
save(diffex.larvae.DBOB2, file="../results/diffex.larvae.DBOB2")
save(diffex.larvae.FBOB1, file="../results/diffex.larvae.FBOB1")
save(diffex.larvae.FBOB2, file="../results/diffex.larvae.FBOB2")
# merge and select unique genes for PCA
degs.larvae.pops <- c(rownames(diffex.larvae.DBFB),
rownames(diffex.larvae.DBOB1),
rownames(diffex.larvae.DBOB2),
rownames(diffex.larvae.FBOB1),
rownames(diffex.larvae.FBOB2)) %>% unique()
save(degs.larvae.pops, file="../results/degs.larvae.pops")
This PCA represents the physiological differences among populations/cohorts of Olympia oyster larvae, upon maternal liberation.
# PCA with points color coded by tissue and pco2 factors
#ggplotly(
plotPCA(vsd.larvae[degs.larvae.pops,], intgroup=c("population")) +
ggtitle("Larval PCA by population, DEGs (var-stabilizing transformed)") + geom_point(size=3, aes(text=colnames(vsd.larvae)))#, tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# generate heatmap of population DEGs
vsd.larvae.df <- as.data.frame(colData(vsd.larvae)[,c("population", "pCO2.parent")])
pheatmap(assay(vsd.larvae[degs.larvae.pops,]), cluster_rows=T, show_rownames=FALSE, show_colnames=FALSE,
cluster_cols=T, annotation_col=vsd.larvae.df[1], scale="row", main = "DEGs among populations")
# Add a factor that defines interaction group
dds.larvae.DESeq$group <- factor(paste0(dds.larvae.DESeq$population, dds.larvae.DESeq$pCO2.parent))
design(dds.larvae.DESeq) <- ~ group
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
dds.larvae.DESeq <- DESeq(dds.larvae.DESeq)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Create results objects, but summary of results are also shown
NOTE: can only compare two treatments at a time
# Here are all the possible contrasts I can make
levels(dds.larvae.DESeq$group)
## [1] "Dabob BayAmbient" "Dabob BayHigh" "Fidalgo BayAmbient"
## [4] "Fidalgo BayHigh" "Oyster Bay C1Ambient" "Oyster Bay C1High"
## [7] "Oyster Bay C2Ambient" "Oyster Bay C2High"
print("Comparison: parental pCO2 - Fidalgo Bay")
## [1] "Comparison: parental pCO2 - Fidalgo Bay"
summary(res.larvae.FB <- results(dds.larvae.DESeq, contrast=c("group", "Fidalgo BayHigh", "Fidalgo BayAmbient"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 1, 0.0033%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Dabob Bay")
## [1] "Comparison: parental pCO2 - Dabob Bay"
summary(res.larvae.DB <- results(dds.larvae.DESeq, contrast=c("group", "Dabob BayHigh", "Dabob BayAmbient"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Oyster Bay Cohort 1")
## [1] "Comparison: parental pCO2 - Oyster Bay Cohort 1"
summary(res.larvae.OB1 <- results(dds.larvae.DESeq, contrast=c("group", "Oyster Bay C1High", "Oyster Bay C1Ambient"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
print("Comparison: parental pCO2 - Oyster Bay Cohort 2")
## [1] "Comparison: parental pCO2 - Oyster Bay Cohort 2"
summary(res.larvae.OB2 <- results(dds.larvae.DESeq, contrast=c("group", "Oyster Bay C2High", "Oyster Bay C2Ambient"), alpha=0.05))
##
## out of 30443 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Fidalgo Bay larvae:", sum(res.larvae.FB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Fidalgo Bay larvae: 1"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Dabob Bay larvae:", sum(res.larvae.DB$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Dabob Bay larvae: 0"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort1 larvae:", sum(res.larvae.OB1$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort1 larvae: 0"
paste("No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort2 larvae:", sum(res.larvae.OB2$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by parental pCO2, Oyster Bay Cohort2 larvae: 0"
diffex.larvae.FB <- subset(res.larvae.FB, padj < 0.05)
#diffex.larvae.DB <- subset(res.larvae.DB, padj < 0.05)
diffex.larvae.FB.counts <- subset(counts(dds.larvae.DESeq), rownames(dds.larvae.DESeq) %in% rownames(diffex.larvae.FB))
diffex.larvae.FB.counts <- diffex.larvae.FB.counts[,subset(key, stage=="larvae" & population=="Fidalgo Bay")$sample_stage]
It can also be useful to examine the counts of reads for a single gene across the groups. A simple function for making this plot is plotCounts, which normalizes counts by sequencing depth and adds a pseudocount of 1/2 to allow for log scale plotting. The counts are grouped by the variables in intgroup, where more than one variable can be specified. Here we specify the gene which had the smallest p value from the results table created above. You can select the gene to plot by rowname or by numeric index.
a <- plotCounts(dds.larvae.DESeq, gene=rownames(diffex.larvae.FB), intgroup=c("population", "pCO2.parent"), returnData = TRUE)
ggplot(subset(a, population=="Fidalgo Bay") %>% rownames_to_column("sample"),
aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
geom_point(position=position_jitter(w = 0.1,h = 0)) +
geom_text() +
theme_bw() +
ggtitle(rownames(diffex.larvae.FB)) +
theme(plot.title = element_text(hjust = 0.5))
======
# dds.larvae.df <- as.data.frame(colData(dds.larvae)[as.data.frame(colData(dds.larvae))$population==
# "Fidalgo Bay", c("population", "pCO2.parent")])
#
# pheatmap(diffex.larvae.FB.counts[,order(colnames(diffex.larvae.FB.counts))],
# cluster_rows=T, show_rownames=FALSE, cluster_columns=T, na.rm=TRUE, scale="row", main = "FB Gene Counts, all differentially expressed genes among pCO2", annotation_col=dds.larvae.df.FB[ order(row.names(dds.larvae.df.FB)), ][2], color=c("dodgerblue3", "goldenrod1"))
# dds.larvae.df.DB <- as.data.frame(colData(dds.larvae)[as.data.frame(colData(dds.larvae))$population==
# "Dabob Bay", c("population", "pCO2.parent")])
#
# pheatmap(diffex.larvae.DB.counts[,order(colnames(diffex.larvae.DB.counts))],
# cluster_rows=T, show_rownames=FALSE, cluster_columns=T, na.rm=TRUE, scale="row", main = "FB Gene Counts, all differentially expressed genes among pCO2", annotation_col=dds.larvae.df.DB[ order(row.names(dds.larvae.df.DB)), ][2], color=c("dodgerblue3", "goldenrod1"))
# larvae.FB.p05.names <-rownames(diffex.larvae.FB[order(diffex.larvae.FB$padj),])
# count_plots = list()
# #plot the 10 genes with lowest p-values
# for (i in 1:10) {
# a <- plotCounts(dds.larvae.DESeq, gene=larvae.FB.p05.names[i], intgroup=c("population", "pCO2.parent"), returnData = TRUE)
# b <- ggplot(subset(a, population=="Fidalgo Bay") %>% rownames_to_column("sample"),
# aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
# geom_point(position=position_jitter(w = 0.1,h = 0)) +
# geom_text() +
# theme_bw() +
# ggtitle(larvae.FB.p05.names[i]) +
# theme(plot.title = element_text(hjust = 0.5))
# count_plots[[i]] <- b
# }
# count_plots
# larvae.DB.p05.names <-rownames(diffex.larvae.DB[order(diffex.larvae.DB$padj),])
# count_plots = list()
# #plot the 10 genes with lowest p-values
# for (i in 1:10) {
# a <- plotCounts(dds.larvae.DESeq, gene=larvae.DB.p05.names[i], intgroup=c("population", "pCO2.parent"), returnData = TRUE)
# b <- ggplot(subset(a, population=="Dabob Bay") %>% rownames_to_column("sample"),
# aes(x=pCO2.parent, y=count, color=pCO2.parent, label=sample)) +
# geom_point(position=position_jitter(w = 0.1,h = 0)) +
# geom_text() +
# theme_bw() +
# ggtitle(larvae.DB.p05.names[i]) +
# theme(plot.title = element_text(hjust = 0.5))
# count_plots[[i]] <- b
# }
# count_plots
# diffex.all.counts <-
# rbind.data.frame(
# diffex.status.counts,
# diffex.ColdVSWarm.counts,
# diffex.AmbVSWarm.counts,
# diffex.ColdVSAmb.counts,
# diffex.9vs26.counts,
# diffex.9vs12.counts,
# diffex.12vs26.counts) %>%
# rownames_to_column("gene") %>%
# arrange(gene)
#
# # are there any duplicate genes? no.
# diffex.all.counts[duplicated(diffex.all.counts), ]
#
# # Move first column with gene names to row names
# diffex.all.counts <- diffex.all.counts %>%
# column_to_rownames("gene")
NOTE: set scale=false to use a variance-covariance matrix, putting more weight on genes with higher counts.
Notes from multivariate class notes: PCA is sensitive to the scale of measurement of the data. If all the data are not measured on the same scale, using covariance means that the result will be determined mostly by the variable with the largest values, as it will have the highest variance. Using a correlation matrix treats all variables the same (standardized to mean=0 and std. dev.=1). In prcomp(), this means specifying scale=TRUE in the function call.
#diff.pca<-FactoMineR::PCA(t(diffex.all.counts),graph=F) #note: need to transform count frame for this PCA
#fviz_screeplot(diff.pca, addlabels = TRUE)
# fviz_contrib(diff.pca, choice = "ind", axes = 1) + ggtitle("Contribution of samples to PC dimension #1")
# fviz_contrib(diff.pca, choice = "ind", axes = 2) + ggtitle("Contribution of samples to PC dimension #2")
# fviz_contrib(diff.pca, choice = "ind", axes = 3) + ggtitle("Contribution of samples to PC dimension #3")
# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA
#
# # PCA plots with samples, color coded by treatments
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC1 ~ PC2, color=infection status")
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC1 ~ PC2, color=temperature")
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC1 ~ PC2, color=day sampled")
#
# # PCA with samples + top 10 genes contributing to PC scores
# fviz_pca_biplot(diff.pca, axes = c(1,2), repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10))
# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA
#
# # PCA plots with samples, color coded by treatments
# fviz_pca_biplot(diff.pca, axes = c(1,3), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC1 ~ PC3, color=infection status")
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC1 ~ PC3, color=temperature")
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC1 ~ PC3, color=day sampled")
#
# # PCA with samples + top 10 genes contributing to PC scores
# fviz_pca_biplot(diff.pca, axes = c(1,3),repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10))
# pca.key <- key[order(match(key$sample_stage, rownames(diff.pca$ind$coord))), ] #create key with samples ordered by same order as they are in the PCA
#
# # PCA plots with samples, color coded by treatments
# fviz_pca_biplot(diff.pca, axes = c(2,3), repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$infection_status, title = "PC2 ~ PC3, color=infection status")
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind"), invisible = "var", select.var = list(contrib = 5), col.ind = pca.key$temperature, title = "PC2 ~ PC3, color=temperature")
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind"), invisible = "var", col.ind = pca.key$day, title = "PC2 ~ PC3, color=day sampled")
#
# # PCA with samples + top 10 genes contributing to PC scores
# fviz_pca_biplot(diff.pca, axes = c(2,3),repel = TRUE, label = c("ind", "var"), select.var = list(contrib = 10))
# diff.pca.gene.data <- fviz_pca_var(diff.pca, axes = c(1,2), select.var = list(contrib = 100))$data
# head(diff.pca.gene.data)
# pca.sample.scores <- diff.pca$ind
# hist(unlist(as.data.frame(pca.sample.scores)[1:3], use.names=FALSE), main="Histogram of PC scores for dimensions 1, 2 & 3") #should have normal distribution for multivariate normality
The following creates a master dataframe from the differentially expressed genes with sample id, treatment info, and counts, in long format
data.frame(t(diffex.all.counts)) %>% you could easily swap in data.frame(t(counts(dds.larvae))) %>% go generate the same dataframe, but with all genes (not just differentially expressed ones)# counts4stats <- key %>%
# mutate(sample_stage=as.character(sample_stage)) %>%
# right_join(
# data.frame(t(diffex.all.counts)) %>%
# rownames_to_column("sample_stage")
# ) %>%
# pivot_longer(cols=starts_with("TRINITY"), values_to = "count", names_to = "gene")
# head(counts4stats)
In DESeq2, the function plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored red if the padj<0.05. Points which fall out of the window are plotted as open triangles pointing either up or down.
# plotMA(res.all.status, main="DEG by infection status\nLog2 fold change ~ mean of normalized counts")
#
# plotMA(res.all.ColdVSWarm, main="DEGs between Cold & Warm\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.AmbVSWarm, main="DEGs between Ambient & Warm\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.ColdVSAmb, main="DEGs between Ambient & Cold\nLog2 fold change ~ mean of normalized counts")
#
# plotMA(res.all.9vs26, main="DEGs between Day 9 & Day 26\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.9vs12, main="DEGs between Day 9 & Day 12\nLog2 fold change ~ mean of normalized counts")
# plotMA(res.all.12vs26, main="DEGs between Day 12 & Day 26\nLog2 fold change ~ mean of normalized counts")
Shrinkage of effect size (LFC estimates) is useful for visualization and ranking of genes. To shrink the LFC, we pass the dds object to the function lfcShrink. Below we specify to use the apeglm method for effect size shrinkage (Zhu, Ibrahim, and Love 2018), which improves on the previous estimator.
We provide the dds object and the name or number of the coefficient we want to shrink, where the number refers to the order of the coefficient as it appears in resultsNames(dds).
lfcShrink() function to the dds.larvae.DESeq object, and specify coef=2 to specify that we want to examine DEGs between infection status by each shrinkage method.# resultsNames(dds.larvae.DESeq) # Check the order of coefficients to use in the lfcShrink function.
# # use `coef=2` to refers to infection status
#
# # Generate MA-plots after the different effect size shrinkage methods
# par(mfrow=c(1,3), mar=c(4,4,2,1))
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="apeglm"), main="apeglm")
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="normal"), main="normal")
# plotMA(lfcShrink(dds.larvae.DESeq, coef=2, type="ashr"), main="ashr")